{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
    "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
   },
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from keras.preprocessing.text import Tokenizer\n",
    "from keras.preprocessing.sequence import pad_sequences\n",
    "from keras.models import Sequential, load_model\n",
    "from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, GlobalMaxPool1D, Conv1D, Dropout, GlobalAveragePooling1D, Flatten, concatenate, Input\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.utils.np_utils import to_categorical\n",
    "from keras.layers.normalization import BatchNormalization\n",
    "import re\n",
    "from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau\n",
    "from keras import regularizers\n",
    "from keras.models import Model\n",
    "from nltk.corpus import stopwords\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "import seaborn as sb\n",
    "from bs4 import BeautifulSoup\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "stemmer = WordNetLemmatizer()\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.ticker import StrMethodFormatter\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from math import sqrt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Competition data and the CSV whose 'x' column lists the meta-feature column names.\n",
    "train_orig = pd.read_csv('Data_Train02.csv', encoding='ISO-8859-1')\n",
    "test_orig = pd.read_csv('Data_Test02.csv', encoding='ISO-8859-1')\n",
    "FeatureNames = pd.read_csv('FeatureNames02.csv', encoding='ISO-8859-1')\n",
    "\n",
    "# Per-id log-price predictions stored in the XGB01 files (presumably written by an\n",
    "# earlier model run - confirm); the train file also carries the fold assignment.\n",
    "train_other_models = pd.read_csv('20190930_XGB01_TRN_DS.csv', encoding='ISO-8859-1')\n",
    "test_other_models = pd.read_csv('20190930_XGB01_TST_DS.csv', encoding='ISO-8859-1')\n",
    "\n",
    "train_other_models = train_other_models[['id','Price_Log_Pred','FOLD_NUM']]\n",
    "test_other_models = test_other_models[['id','Price_Log_Pred']]\n",
    "\n",
    "train = pd.merge(train_orig, train_other_models, on='id')\n",
    "test = pd.merge(test_orig, test_other_models, on='id')\n",
    "\n",
    "# pd.merge performs an inner join here, so unmatched or duplicated ids change the row\n",
    "# count without any message. The test predictions are later written row by row into the\n",
    "# submission sheet, so a changed row count is reported now rather than after training.\n",
    "assert len(test) == len(test_orig), (\n",
    "    'merging test with test_other_models changed the row count from %d to %d'\n",
    "    % (len(test_orig), len(test)))\n",
    "\n",
    "# Regression target: log10(Price + 1).\n",
    "train['Price_Log'] = np.log10(train['Price']+1)\n",
    "train.hist(column='Price_Log')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# FeatureNames starts out as the DataFrame read from FeatureNames02.csv; reduce it to a\n",
    "# plain list of the values in its 'x' column. The isinstance guard keeps this cell\n",
    "# re-runnable: on a second execution FeatureNames is already a list and is left as is.\n",
    "if isinstance(FeatureNames, pd.DataFrame):\n",
    "    FeatureNames = FeatureNames['x'].values.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cleaning_text(review, remove_stopwords=False, Lem=False):\n",
    "    \"\"\"Strip markup from ``review`` and return it as lower-case, space-separated words.\n",
    "\n",
    "    HTML is removed with BeautifulSoup, every character outside a-z/A-Z is turned into a\n",
    "    space, the text is lower-cased and split on whitespace, and one-letter tokens are\n",
    "    discarded.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    review : str\n",
    "        Raw text. ``None`` and NaN give an empty string; any other value goes through ``str()``.\n",
    "    remove_stopwords : bool, default False\n",
    "        Drop the words contained in NLTK's English stop-word list.\n",
    "    Lem : bool, default False\n",
    "        Lemmatize every remaining word with the module-level ``stemmer``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The cleaned words joined by single spaces.\n",
    "    \"\"\"\n",
    "    # NaN is the only float that is not equal to itself.\n",
    "    if review is None or (isinstance(review, float) and review != review):\n",
    "        return ''\n",
    "    review_text = BeautifulSoup(str(review), \"html.parser\").get_text()\n",
    "    # Non-letters become spaces, not empty strings: deleting them would also delete the\n",
    "    # blanks between words and fuse the whole text into a single token.\n",
    "    review_text = re.sub(r'[^a-zA-Z]', ' ', review_text)\n",
    "    # split() without arguments already collapses runs of whitespace; filtering on length\n",
    "    # removes single characters wherever they occur, including at the start and the end.\n",
    "    words = [w for w in review_text.lower().split() if len(w) > 1]\n",
    "    if remove_stopwords:\n",
    "        # The stop-word list is loaded once and kept on the function object, so applying\n",
    "        # this function row by row does not rebuild it for every row.\n",
    "        stops = getattr(cleaning_text, '_stops', None)\n",
    "        if stops is None:\n",
    "            stops = cleaning_text._stops = frozenset(stopwords.words(\"english\"))\n",
    "        words = [w for w in words if w not in stops]\n",
    "    if Lem:\n",
    "        words = [stemmer.lemmatize(w) for w in words]\n",
    "    return ' '.join(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a cleaned copy of each text column to both frames. cleaning_text is called with\n",
    "# remove_stopwords=True and Lem=False; the new column is the raw name plus a '2' suffix.\n",
    "for raw_col in ('Synopsis', 'Title'):\n",
    "    for frame in (train, test):\n",
    "        frame[raw_col + '2'] = frame[raw_col].apply(cleaning_text, args=(True, False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Synopsis tokenizer: created with num_words=max_fatures_syn and fitted on the cleaned\n",
    "# training synopses only.\n",
    "max_fatures_syn = 3000\n",
    "tokenizer_syn = Tokenizer(split=' ', num_words=max_fatures_syn)\n",
    "tokenizer_syn.fit_on_texts(train['Synopsis2'].values)\n",
    "word_index_syn = tokenizer_syn.word_index\n",
    "\n",
    "# word_index holds every token seen while fitting.\n",
    "print('Found %s unique tokens.' % (len(word_index_syn),))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Title tokenizer: created with num_words=max_fatures_tit and fitted on the cleaned\n",
    "# training titles only.\n",
    "max_fatures_tit = 3000\n",
    "tokenizer_tit = Tokenizer(split=' ', num_words=max_fatures_tit)\n",
    "tokenizer_tit.fit_on_texts(train['Title2'].values)\n",
    "word_index_tit = tokenizer_tit.word_index\n",
    "\n",
    "# word_index holds every token seen while fitting.\n",
    "print('Found %s unique tokens.' % (len(word_index_tit),))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def nn_model(nlp_input_shape,meta_input_shape,max_fatures,dim,nlp_input_shape2,max_fatures2,dim2):\n",
    "    \"\"\"Build the (uncompiled) three-input Keras regression network.\n",
    "\n",
    "    Each of the two text inputs goes through Embedding -> Conv1D(200, 3) -> Conv1D(200, 5)\n",
    "    -> GlobalMaxPool1D. The two pooled vectors are concatenated with the meta-feature input\n",
    "    and passed through BatchNormalization / Dense / Dropout stages that end in Dense(1).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    nlp_input_shape, nlp_input_shape2 : int\n",
    "        Sequence length of the first / second text input.\n",
    "    meta_input_shape : int\n",
    "        Width of the meta-feature input.\n",
    "    max_fatures, max_fatures2 : int\n",
    "        First argument of the Embedding layer of the first / second text branch.\n",
    "    dim, dim2 : int\n",
    "        Second argument of the Embedding layer of the first / second text branch.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Model\n",
    "        Keras model taking [nlp_input, meta_input, nlp_input2] and producing one output.\n",
    "    \"\"\"\n",
    "    def conv_branch(tokens, vocab_size, emb_dim, seq_len):\n",
    "        # Embedding, two stacked valid-padded ReLU convolutions, then a global max pool.\n",
    "        features = Embedding(vocab_size, emb_dim, input_length = seq_len)(tokens)\n",
    "        for kernel_size in (3, 5):\n",
    "            features = Conv1D(200, kernel_size, padding='valid', activation='relu', strides=1)(features)\n",
    "        return GlobalMaxPool1D()(features)\n",
    "\n",
    "    nlp_input = Input(shape=(nlp_input_shape,), name='nlp_input')\n",
    "    meta_input = Input(shape=(meta_input_shape,), name='meta_input')\n",
    "    nlp_input2 = Input(shape=(nlp_input_shape2,), name='nlp_input2')\n",
    "\n",
    "    pooled_first = conv_branch(nlp_input, max_fatures, dim, nlp_input_shape)\n",
    "    pooled_second = conv_branch(nlp_input2, max_fatures2, dim2, nlp_input_shape2)\n",
    "\n",
    "    # Dense head: two (BatchNormalization, linear Dense, Dropout) stages, then the output layer.\n",
    "    hidden = concatenate([pooled_first, meta_input, pooled_second])\n",
    "    for units, drop_rate in ((100, 0.2), (50, 0.1)):\n",
    "        hidden = BatchNormalization()(hidden)\n",
    "        hidden = Dense(units, activation='linear')(hidden)\n",
    "        hidden = Dropout(drop_rate)(hidden)\n",
    "    hidden = BatchNormalization()(hidden)\n",
    "    prediction = Dense(1)(hidden)\n",
    "\n",
    "    return Model(inputs=[nlp_input , meta_input, nlp_input2], outputs=[prediction])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Columns of train/test that feed the network's meta input.\n",
    "META_FEATURES_VARNAMES = FeatureNames\n",
    "\n",
    "# Distinct FOLD_NUM values in ascending order.\n",
    "fold_list = sorted(train.FOLD_NUM.unique())\n",
    "fold_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Out-of-fold training. For each value in fold_list a new network is fitted on the rows of\n",
    "# the other folds and validated on the rows of that fold; the file written by\n",
    "# ModelCheckpoint is then reloaded to predict the held-out rows.\n",
    "# Results: keras01_Models (checkpoint file names, one per fold) and\n",
    "# training_cv_preds_keras01 (all held-out rows plus a Price_Log_Pred_Keras column).\n",
    "keras01_Models = []\n",
    "fold_val_frames = []  # held-out frames, concatenated once after the loop\n",
    "batch_size = 64\n",
    "for fold_num in fold_list:\n",
    "    print(\"Running for : \",fold_num)\n",
    "\n",
    "    temp_train = train[train['FOLD_NUM'] != fold_num]\n",
    "    # Copy so that the prediction column added below lands on an independent frame\n",
    "    # rather than on a slice of train.\n",
    "    temp_val = train[train['FOLD_NUM'] == fold_num].copy()\n",
    "\n",
    "    # Synopsis text -> integer sequences, post-padded to length 100.\n",
    "    X_temp_train = tokenizer_syn.texts_to_sequences(temp_train['Synopsis2'].values)\n",
    "    X_temp_train = pad_sequences(X_temp_train, maxlen=100, padding='post')\n",
    "\n",
    "    X_temp_val = tokenizer_syn.texts_to_sequences(temp_val['Synopsis2'].values)\n",
    "    X_temp_val = pad_sequences(X_temp_val, maxlen=100, padding='post')\n",
    "\n",
    "    X_temp_train_meta = np.array(temp_train[META_FEATURES_VARNAMES])\n",
    "    X_temp_val_meta = np.array(temp_val[META_FEATURES_VARNAMES])\n",
    "\n",
    "    # Title text -> integer sequences, post-padded to length 100.\n",
    "    X_temp_train2 = tokenizer_tit.texts_to_sequences(temp_train['Title2'].values)\n",
    "    X_temp_train2 = pad_sequences(X_temp_train2, maxlen=100, padding='post')\n",
    "\n",
    "    X_temp_val2 = tokenizer_tit.texts_to_sequences(temp_val['Title2'].values)\n",
    "    X_temp_val2 = pad_sequences(X_temp_val2, maxlen=100, padding='post')\n",
    "\n",
    "    Y_temp_train = temp_train['Price_Log'].values\n",
    "    Y_temp_val = temp_val['Price_Log'].values\n",
    "\n",
    "    print(\"Y train/val Shapes : \",Y_temp_train.shape,Y_temp_val.shape)\n",
    "    print(\"X train/val Shapes : \",X_temp_train.shape,X_temp_val.shape)\n",
    "\n",
    "    model = nn_model(X_temp_train.shape[1],X_temp_train_meta.shape[1],max_fatures_syn,300,X_temp_train2.shape[1],max_fatures_tit,300)\n",
    "    model.compile(loss = 'mean_squared_error', optimizer='adam',metrics = ['mean_squared_error'])\n",
    "    print(model.summary())\n",
    "\n",
    "    file_name = \"20190930_Keras01_Concat_Model_Weights_Fold_\"+str(fold_num)+'.h5'\n",
    "    final_path = file_name\n",
    "    print(\"Model Weights File Name : \",final_path)\n",
    "    keras01_Models.append(final_path)\n",
    "\n",
    "    es = EarlyStopping(mode='min',\n",
    "                       verbose=1,\n",
    "                       patience=10)\n",
    "    checkpointer = ModelCheckpoint(filepath=final_path,\n",
    "                                   mode='min',\n",
    "                                   verbose=1,\n",
    "                                   save_best_only=True)\n",
    "    reduce_lr = ReduceLROnPlateau(factor=0.2,\n",
    "                                  patience=3,\n",
    "                                  min_lr=0.0000001,\n",
    "                                  verbose=1)\n",
    "    # reduce_lr must be in the callback list to have any effect on training.\n",
    "    history = model.fit([X_temp_train,X_temp_train_meta,X_temp_train2], Y_temp_train,\n",
    "                        epochs = 200,\n",
    "                        batch_size = batch_size,\n",
    "                        verbose = 1,\n",
    "                        validation_data = ([X_temp_val,X_temp_val_meta,X_temp_val2], Y_temp_val),\n",
    "                        callbacks = [es,checkpointer,reduce_lr])\n",
    "\n",
    "    print(\"Loading Model for Prediction\")\n",
    "\n",
    "    loaded_model = load_model(final_path)\n",
    "    Y_temp_val_pred = loaded_model.predict([X_temp_val,X_temp_val_meta,X_temp_val2])\n",
    "\n",
    "    # The predict() output is flattened to 1-D before being stored as a column.\n",
    "    temp_val['Price_Log_Pred_Keras'] = np.ravel(Y_temp_val_pred)\n",
    "\n",
    "    print('Completed for Fold - ',fold_num)\n",
    "    print('Val Data RMSLE : ',sqrt(mean_squared_error(temp_val['Price_Log'], temp_val['Price_Log_Pred_Keras'])))\n",
    "\n",
    "    fold_val_frames.append(temp_val)\n",
    "\n",
    "# Built from the collected list, so the result does not depend on the folds being numbered\n",
    "# from 1 and cannot pick up a frame left over from an earlier run of this cell.\n",
    "training_cv_preds_keras01 = pd.concat(fold_val_frames, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Checkpoint file names, then the root of the mean squared error between Price_Log and\n",
    "# Price_Log_Pred_Keras over all rows of training_cv_preds_keras01.\n",
    "print(keras01_Models)\n",
    "cv_rmse = sqrt(mean_squared_error(training_cv_preds_keras01['Price_Log'],\n",
    "                                  training_cv_preds_keras01['Price_Log_Pred_Keras']))\n",
    "print(\"Keras 01 CV RMSLE = \",cv_rmse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the held-out predictions to disk without the index column.\n",
    "training_cv_preds_keras01.to_csv(path_or_buf=\"20190930_Keras01_TRN_CV_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# Encode the test texts with the same tokenizers and padding (post, length 100) used in\n",
    "# the training loop.\n",
    "X_test = pad_sequences(tokenizer_syn.texts_to_sequences(test['Synopsis2'].values), maxlen=100, padding='post')\n",
    "X_test_meta = np.array(test[META_FEATURES_VARNAMES])\n",
    "X_test2 = pad_sequences(tokenizer_tit.texts_to_sequences(test['Title2'].values), maxlen=100, padding='post')\n",
    "\n",
    "# Running sum of the per-fold predictions, one row per test row and a single column.\n",
    "test_preds = np.zeros((test.shape[0],1))\n",
    "\n",
    "for fname in keras01_Models:\n",
    "    print(\"Running for : \",fname)\n",
    "    loaded_model = load_model(fname)\n",
    "    Y_test = loaded_model.predict([X_test,X_test_meta,X_test2])\n",
    "    test_preds += Y_test\n",
    "\n",
    "# Sum -> mean over the saved fold models.\n",
    "test_preds /= len(keras01_Models)\n",
    "\n",
    "test['Price_Log_Pred_Keras'] = test_preds\n",
    "test.to_csv(path_or_buf=\"20190930_Keras01_TST_CV_DS.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sample submission workbook, used as the template for the final file. No `encoding`\n",
    "# argument is passed: recent pandas releases reject it for read_excel (TypeError), and an\n",
    "# .xlsx workbook does not need a caller-supplied text encoding.\n",
    "submission = pd.read_excel('Sample_Submission.xlsx')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Invert the log10(Price + 1) transform that defined the training target.\n",
    "test_preds2 = np.power(10, test_preds) - 1\n",
    "pd.DataFrame(test_preds2).describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the back-transformed predictions into the template's Price column and save the\n",
    "# workbook without the index column.\n",
    "submission['Price'] = test_preds2\n",
    "submission.to_excel(excel_writer='20190930_Keras01_DS.xlsx', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
